suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
Settings
# Data directory on the mounted volume.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
# Analysis root; paste_wd() prepends this to relative table paths.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)
# Output folders handed to the figure/table export helpers as `outdir`.
figdir <- paste0(wd, 'Figures/DRS_m3C_RNAs/Parameters/')
tabledir <- paste0(wd, 'Tables/Espresso/')
# Session-wide ggplot2 theme: classic, 7 pt base size, legend at the bottom.
theme_set(
theme_classic(base_size = 7) +
theme(legend.position = 'bottom')
)
Functions
# Prefix `path` with the analysis root `wd`.
# Plain concatenation: no separator is inserted, so `wd` must end with "/".
paste_wd <- function(path) paste0(wd, path)
# Read one space-delimited, header-less count file (columns: count,
# transcript_id) and tag every row with the name of the file it came from.
#
# path: path to a single count file.
# Returns a tibble with columns `count`, `transcript_id` and `basename`.
read_readcounts_Espresso_unspliced <- function(path) {
  counts <- read_delim(
    path,
    delim = ' ',
    col_names = c('count', 'transcript_id')
  )
  mutate(counts, basename = basename(path))
}
# Add the cumulative y-range of each slice for a stacked / donut chart.
#
# df: data frame with a numeric `percentage` column (0-100 scale), one row per
#     slice, in drawing order.
# Returns `df` with two columns appended: `ymax`, the running sum of
# `percentage / 100` over all rows, and `ymin`, the previous row's `ymax`
# (0 for the first row). A zero-row input yields a zero-row output.
add_yrange <- function(df) {
  stopifnot("percentage" %in% names(df))
  upper <- cumsum(df$percentage / 100)
  df$ymax <- upper
  # Shift by one row; indexing with seq_along() keeps the length equal to the
  # number of rows even when there are none.
  df$ymin <- c(0, upper)[seq_along(upper)]
  df
}
# Draw a donut chart from a table of slice percentages.
#
# df:     data frame with a `percentage` column (0-100 scale), one row per
#         slice, plus the column passed as `.var`.
# .var:   unquoted column whose values label and colour the slices.
# values: colours given to the manual fill and colour scales. The default is
#         the two-level blue/red palette; supply one colour per slice when
#         there are more than two levels.
# Returns a ggplot object.
donutplot <- function(df, .var, values = c('blue', 'red')) {
  df |>
    add_yrange() |>
    ggplot(aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = {{ .var }}, colour = {{ .var }}
    )) +
    geom_rect() +
    coord_polar(theta = 'y') +
    # Labels are placed at x = 1, i.e. inside the ring drawn between x = 2 and
    # x = 4, at the midpoint of each slice.
    ggrepel::geom_text_repel(
      aes(label = {{ .var }}, y = (ymin + ymax) / 2), x = 1
    ) +
    # The x range starts below xmin so the centre stays empty (ring, not pie).
    xlim(c(-1, 4)) +
    scale_fill_manual(values = values) +
    scale_color_manual(values = values) +
    theme_void()
}
Read data
espresso_deseq2 <-
read_tsv(
'Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv' |> paste_wd()
)
## Rows: 36717 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): transcript_id, transcript_type, transcript_name, gene_id, gene_typ...
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_deseq2
## # A tibble: 36,717 × 29
## transcript_id transcript_type transcript_name gene_id gene_type gene_name
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000498442.1 retained_intron CRBN-212 ENSG00… protein_… CRBN
## 2 ENST00000459840.5 retained_intron CRBN-205 ENSG00… protein_… CRBN
## 3 ENST00000231948.9 protein_coding CRBN-201 ENSG00… protein_… CRBN
## 4 ENST00000432408.6 protein_coding CRBN-203 ENSG00… protein_… CRBN
## 5 ENST00000339437.… protein_coding TRNT1-203 ENSG00… protein_… TRNT1
## 6 ENST00000488263.5 retained_intron CRBN-209 ENSG00… protein_… CRBN
## 7 ENST00000420393.5 protein_coding TRNT1-207 ENSG00… protein_… TRNT1
## 8 ENST00000698415.1 retained_intron TRNT1-230 ENSG00… protein_… TRNT1
## 9 ENST00000450014.1 protein_coding CRBN-204 ENSG00… protein_… CRBN
## 10 ENST00000698416.1 retained_intron TRNT1-231 ENSG00… protein_… TRNT1
## # ℹ 36,707 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## # siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## # siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## # siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## # siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## # siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
methylated_positions <-
read_tsv(
'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv' |> paste_wd()
)
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
methylated_positions
## # A tibble: 489 × 13
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCG 58 62
## 5 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCT 76 80
## 6 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ATCAA 94 98
## 7 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GCCAC 149 153
## 8 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCC 154 158
## 9 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCC 155 159
## 10 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCA 156 160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
methylated_RNAs <-
methylated_positions |>
select(starts_with('transcript_'), starts_with('gene_')) |>
distinct()
methylated_RNAs
## # A tibble: 71 × 3
## transcript_id gene_name gene_type
## <chr> <chr> <chr>
## 1 ENST00000429711.7 RPL32 protein_coding
## 2 ENST00000647248.2 RPL35A protein_coding
## 3 ENST00000389680.2 MT-RNR1 Mt_rRNA
## 4 ENST00000361390.2 MT-ND1 protein_coding
## 5 ENST00000361453.3 MT-ND2 protein_coding
## 6 ENST00000387347.2 MT-RNR2 Mt_rRNA
## 7 ENST00000361624.2 MT-CO1 protein_coding
## 8 ENST00000361739.1 MT-CO2 protein_coding
## 9 ENST00000361899.2 MT-ATP6 protein_coding
## 10 ENST00000361227.2 MT-ND3 protein_coding
## # ℹ 61 more rows
deseq2_normcount <-
read_tsv('Tables/Espresso/espresso_DESeq2_normcount__2024-05-21.tsv.gz' |> paste_wd())
## Rows: 36717 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): transcript_id, transcript_type, transcript_name, gene_id, gene_type...
## dbl (9): siMETTL2A_I_N1, siMETTL2A_I_N2, siMETTL2A_I_N3, siMETTL2A_G_N1, siM...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
deseq2_normcount
## # A tibble: 36,717 × 16
## transcript_id transcript_type transcript_name gene_id gene_type gene_name
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000498442.1 retained_intron CRBN-212 ENSG00… protein_… CRBN
## 2 ENST00000459840.5 retained_intron CRBN-205 ENSG00… protein_… CRBN
## 3 ENST00000231948.9 protein_coding CRBN-201 ENSG00… protein_… CRBN
## 4 ENST00000432408.6 protein_coding CRBN-203 ENSG00… protein_… CRBN
## 5 ENST00000339437.… protein_coding TRNT1-203 ENSG00… protein_… TRNT1
## 6 ENST00000488263.5 retained_intron CRBN-209 ENSG00… protein_… CRBN
## 7 ENST00000420393.5 protein_coding TRNT1-207 ENSG00… protein_… TRNT1
## 8 ENST00000698415.1 retained_intron TRNT1-230 ENSG00… protein_… TRNT1
## 9 ENST00000450014.1 protein_coding CRBN-204 ENSG00… protein_… CRBN
## 10 ENST00000698416.1 retained_intron TRNT1-231 ENSG00… protein_… TRNT1
## # ℹ 36,707 more rows
## # ℹ 10 more variables: siMETTL2A_I_N1 <dbl>, siMETTL2A_I_N2 <dbl>,
## # siMETTL2A_I_N3 <dbl>, siMETTL2A_G_N1 <dbl>, siMETTL2A_G_N2 <dbl>,
## # siMETTL2A_G_N3 <dbl>, Cont_D_N1 <dbl>, Cont_D_N2 <dbl>, Cont_D_N3 <dbl>,
## # seqname <chr>
cpms <-
read_tsv(
'Tables/DRS_quantification/espresso_quantification_cpm_2024-04-19.tsv.gz' |> paste_wd()
)
## Rows: 330453 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): transcript_id, transcript_name, gene_id, type, si, seqname, source...
## dbl (6): rep, count, total_reads, cpm, start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cpms
## # A tibble: 330,453 × 20
## transcript_id transcript_name gene_id type si rep count total_reads
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENST00000498442.1 CRBN-212 ENSG00… siME… I 1 0 3552783
## 2 ENST00000498442.1 CRBN-212 ENSG00… siME… I 2 1 997879
## 3 ENST00000498442.1 CRBN-212 ENSG00… siME… I 3 0 2778705
## 4 ENST00000498442.1 CRBN-212 ENSG00… siME… G 1 0 3497396
## 5 ENST00000498442.1 CRBN-212 ENSG00… siME… G 2 0 3810844
## 6 ENST00000498442.1 CRBN-212 ENSG00… siME… G 3 0 3668094
## 7 ENST00000498442.1 CRBN-212 ENSG00… Cont D 1 1 2701773
## 8 ENST00000498442.1 CRBN-212 ENSG00… Cont D 2 1 3406597
## 9 ENST00000498442.1 CRBN-212 ENSG00… Cont D 3 0 3653792
## 10 ENST00000459840.5 CRBN-205 ENSG00… siME… I 1 1.08 3552783
## # ℹ 330,443 more rows
## # ℹ 12 more variables: cpm <dbl>, seqname <chr>, source <chr>, feature <chr>,
## # start <dbl>, end <dbl>, score <chr>, strand <chr>, frame <chr>,
## # gene_type <chr>, gene_name <chr>, transcript_type <chr>
# Read every `*_count.txt` file in the Espresso_unspliced folder and stack them
# into one long table (one row per transcript per file).
# bind_rows() binds the whole list in a single call, so the accumulated table
# is not re-copied once per file.
readcount_espressso_unspliced <-
  fs::dir_ls(
    '/Volumes/Mitsu_NGS_2/METTL2A/Alignment/Minimap2/Espresso_unspliced/',
    glob = '*_count.txt'
  ) |>
  map(read_readcounts_Espresso_unspliced) |>
  bind_rows()
## Rows: 26582 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 20396 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 24993 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25198 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25097 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 24350 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25403 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25027 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25832 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
readcount_espressso_unspliced
## # A tibble: 222,878 × 3
## count transcript_id basename
## <dbl> <chr> <chr>
## 1 266 ENST00000000233.10 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 2 29 ENST00000000412.8 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 3 34 ENST00000000442.11 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 4 114 ENST00000001008.6 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 5 12 ENST00000002125.9 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 6 120 ENST00000002165.11 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 7 2 ENST00000002501.11 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 8 19 ENST00000002596.6 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 9 178 ENST00000003100.13 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 10 2 ENST00000003583.12 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## # ℹ 222,868 more rows
espresso_AsPC1_seqs <-
read_tsv(
'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz' |> paste_wd()
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_AsPC1_seqs
## # A tibble: 36,717 × 3
## transcript_id transcript_seq transcript_length
## <chr> <chr> <dbl>
## 1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA… 987
## 2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA… 2252
## 3 ENST00000420393.5 CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGG… 854
## 4 ENST00000698415.1 GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTC… 6597
## 5 ENST00000698416.1 CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTC… 5500
## 6 ENST00000488263.5 AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTT… 4528
## 7 ENST00000424814.5 GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACC… 2038
## 8 ENST00000231948.9 AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCA… 2187
## 9 ENST00000432408.6 GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAG… 2203
## 10 ENST00000459840.5 ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTG… 723
## # ℹ 36,707 more rows
Calculate average normcount
# deseq2_normcount_mean <-
# deseq2_normcount |>
# select(transcript_id, starts_with('siMETTL2A_'), starts_with('Cont_')) |>
# pivot_longer(cols = -transcript_id) |>
# group_by(transcript_id) |>
# reframe(mean_normcount = mean(value, na.rm = TRUE))
# Per-transcript mean CPM and summed count over all rows of `cpms`.
mean_CPMs <-
  summarise(
    group_by(cpms, transcript_id),
    mean_CPMs = mean(cpm, na.rm = TRUE),
    total_counts = sum(count, na.rm = TRUE),
    .groups = 'drop'
  )
mean_CPMs
## # A tibble: 36,717 × 3
## transcript_id mean_CPMs total_counts
## <chr> <dbl> <dbl>
## 1 ENST00000000233.10 46.8 1222.
## 2 ENST00000000412.8 7.60 218
## 3 ENST00000000442.11 7.30 188.
## 4 ENST00000001008.6 16.9 482
## 5 ENST00000002125.9 2.12 48.8
## 6 ENST00000002165.11 21.6 576.
## 7 ENST00000002501.11 0.0703 2
## 8 ENST00000002596.6 1.93 52
## 9 ENST00000003100.13 20.5 584.
## 10 ENST00000003583.12 0.164 4.6
## # ℹ 36,707 more rows
# Per-transcript read-count summary across the count files.
# The number of samples is taken from the data (one count file per sample).
# A transcript that is absent from a file is treated as 0 reads in that sample:
# `mean_read` divides by the number of files, and `min_read` is 0 whenever the
# transcript is missing from at least one file.
n_samples <- n_distinct(readcount_espressso_unspliced$basename)
sum_readcount_espresso_unspliced <-
  readcount_espressso_unspliced |>
  group_by(transcript_id) |>
  reframe(
    total_read = sum(count, na.rm = TRUE),
    min_read = if (n_distinct(basename) < n_samples) {
      0
    } else {
      min(count, na.rm = TRUE)
    }
  ) |>
  mutate(mean_read = total_read / n_samples)
sum_readcount_espresso_unspliced
## # A tibble: 33,173 × 4
## transcript_id total_read min_read mean_read
## <chr> <dbl> <dbl> <dbl>
## 1 ENST00000000233.10 1359 85 151
## 2 ENST00000000412.8 519 8 57.7
## 3 ENST00000000442.11 227 8 25.2
## 4 ENST00000001008.6 809 23 89.9
## 5 ENST00000002125.9 83 4 9.22
## 6 ENST00000002165.11 589 33 65.4
## 7 ENST00000002501.11 7 1 0.778
## 8 ENST00000002596.6 101 1 11.2
## 9 ENST00000003100.13 1417 40 157.
## 10 ENST00000003583.12 23 1 2.56
## # ℹ 33,163 more rows
Calculate GC% …
# Per-transcript base composition: number of G/C, C and 'CC' matches in the
# sequence, each also expressed as a fraction of `transcript_length`.
# NOTE(review): str_count() counts non-overlapping matches, so a run such as
# 'CCC' contributes a single 'CC' — confirm this is the intended definition.
espresso_AsPC1_RNAinfo <-
  mutate(
    espresso_AsPC1_seqs,
    num_GC = str_count(transcript_seq, 'G|C'),
    num_C = str_count(transcript_seq, 'C'),
    num_CC = str_count(transcript_seq, 'CC'),
    GC_content = num_GC / transcript_length,
    C_content = num_C / transcript_length,
    CC_content = num_CC / transcript_length
  )
espresso_AsPC1_RNAinfo
## # A tibble: 36,717 × 9
## transcript_id transcript_seq transcript_length num_GC num_C num_CC GC_content
## <chr> <chr> <dbl> <int> <int> <int> <dbl>
## 1 ENST00000339… AGCCCGGAAGTGC… 987 406 182 35 0.411
## 2 ENST00000251… AGCCCGGAAGTGC… 2252 838 353 60 0.372
## 3 ENST00000420… CAGCGGGGCCGGT… 854 459 215 53 0.537
## 4 ENST00000698… GATGTATGATGAG… 6597 2365 1105 179 0.358
## 5 ENST00000698… CATGACTAGTTTT… 5500 1892 893 143 0.344
## 6 ENST00000488… AGGAACTTCATCA… 4528 1621 736 126 0.358
## 7 ENST00000424… GAGATCAGCAGGA… 2038 763 368 73 0.374
## 8 ENST00000231… AGACATGGCCGGC… 2187 835 400 80 0.382
## 9 ENST00000432… GCCTCCTTTGCGG… 2203 844 405 82 0.383
## 10 ENST00000459… ATGGAGGCATTTA… 723 282 132 26 0.390
## # ℹ 36,707 more rows
## # ℹ 2 more variables: C_content <dbl>, CC_content <dbl>
# Flag each transcript as 'm3C' (present in methylated_RNAs) or 'other'.
# Join keys are spelled out so a column added to either table later cannot
# silently change the match. methylated_RNAs is distinct() on exactly these
# keys, so the many-to-one check guards against accidental row duplication.
espresso_deseq2_m3Cinfo <-
  espresso_deseq2 |>
  left_join(
    methylated_RNAs |> mutate(m3C = 'm3C'),
    by = join_by(transcript_id, gene_type, gene_name),
    relationship = 'many-to-one'
  ) |>
  replace_na(list(m3C = 'other'))
## Joining with `by = join_by(transcript_id, gene_type, gene_name)`
espresso_deseq2_m3Cinfo |>
export_tsv(outdir = tabledir, compression = 'gz')
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/Espresso/espresso_deseq2_m3Cinfo_2024-07-31.tsv.gz
## # A tibble: 36,717 × 30
## transcript_id transcript_type transcript_name gene_id gene_type gene_name
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000498442.1 retained_intron CRBN-212 ENSG00… protein_… CRBN
## 2 ENST00000459840.5 retained_intron CRBN-205 ENSG00… protein_… CRBN
## 3 ENST00000231948.9 protein_coding CRBN-201 ENSG00… protein_… CRBN
## 4 ENST00000432408.6 protein_coding CRBN-203 ENSG00… protein_… CRBN
## 5 ENST00000339437.… protein_coding TRNT1-203 ENSG00… protein_… TRNT1
## 6 ENST00000488263.5 retained_intron CRBN-209 ENSG00… protein_… CRBN
## 7 ENST00000420393.5 protein_coding TRNT1-207 ENSG00… protein_… TRNT1
## 8 ENST00000698415.1 retained_intron TRNT1-230 ENSG00… protein_… TRNT1
## 9 ENST00000450014.1 protein_coding CRBN-204 ENSG00… protein_… CRBN
## 10 ENST00000698416.1 retained_intron TRNT1-231 ENSG00… protein_… TRNT1
## # ℹ 36,707 more rows
## # ℹ 24 more variables: siMETTL2A_baseMean <dbl>,
## # siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## # siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## # siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## # siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## # siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Transcript annotation plus read-count summary and sequence composition.
# Both joins are keyed explicitly on transcript_id, so columns shared by
# accident can never become join keys.
m3CRNA_transcriptinfo <-
  espresso_deseq2_m3Cinfo |>
  select(
    starts_with('transcript_'), starts_with('gene_'), seqname, genetype2,
    m3C, common_DETs
  ) |>
  left_join(sum_readcount_espresso_unspliced, by = join_by(transcript_id)) |>
  left_join(espresso_AsPC1_RNAinfo, by = join_by(transcript_id))
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
m3CRNA_transcriptinfo
## # A tibble: 36,717 × 21
## transcript_id transcript_type transcript_name gene_id gene_type gene_name
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000498442.1 retained_intron CRBN-212 ENSG00… protein_… CRBN
## 2 ENST00000459840.5 retained_intron CRBN-205 ENSG00… protein_… CRBN
## 3 ENST00000231948.9 protein_coding CRBN-201 ENSG00… protein_… CRBN
## 4 ENST00000432408.6 protein_coding CRBN-203 ENSG00… protein_… CRBN
## 5 ENST00000339437.… protein_coding TRNT1-203 ENSG00… protein_… TRNT1
## 6 ENST00000488263.5 retained_intron CRBN-209 ENSG00… protein_… CRBN
## 7 ENST00000420393.5 protein_coding TRNT1-207 ENSG00… protein_… TRNT1
## 8 ENST00000698415.1 retained_intron TRNT1-230 ENSG00… protein_… TRNT1
## 9 ENST00000450014.1 protein_coding CRBN-204 ENSG00… protein_… CRBN
## 10 ENST00000698416.1 retained_intron TRNT1-231 ENSG00… protein_… TRNT1
## # ℹ 36,707 more rows
## # ℹ 15 more variables: seqname <chr>, genetype2 <chr>, m3C <chr>,
## # common_DETs <chr>, total_read <dbl>, min_read <dbl>, mean_read <dbl>,
## # transcript_seq <chr>, transcript_length <dbl>, num_GC <int>, num_C <int>,
## # num_CC <int>, GC_content <dbl>, C_content <dbl>, CC_content <dbl>
m3CRNA_transcriptinfo |>
export_tsv(outdir = tabledir, compression = 'gz')
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/Espresso/m3CRNA_transcriptinfo_2024-07-31.tsv.gz
## # A tibble: 36,717 × 21
## transcript_id transcript_type transcript_name gene_id gene_type gene_name
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000498442.1 retained_intron CRBN-212 ENSG00… protein_… CRBN
## 2 ENST00000459840.5 retained_intron CRBN-205 ENSG00… protein_… CRBN
## 3 ENST00000231948.9 protein_coding CRBN-201 ENSG00… protein_… CRBN
## 4 ENST00000432408.6 protein_coding CRBN-203 ENSG00… protein_… CRBN
## 5 ENST00000339437.… protein_coding TRNT1-203 ENSG00… protein_… TRNT1
## 6 ENST00000488263.5 retained_intron CRBN-209 ENSG00… protein_… CRBN
## 7 ENST00000420393.5 protein_coding TRNT1-207 ENSG00… protein_… TRNT1
## 8 ENST00000698415.1 retained_intron TRNT1-230 ENSG00… protein_… TRNT1
## 9 ENST00000450014.1 protein_coding CRBN-204 ENSG00… protein_… CRBN
## 10 ENST00000698416.1 retained_intron TRNT1-231 ENSG00… protein_… TRNT1
## # ℹ 36,707 more rows
## # ℹ 15 more variables: seqname <chr>, genetype2 <chr>, m3C <chr>,
## # common_DETs <chr>, total_read <dbl>, min_read <dbl>, mean_read <dbl>,
## # transcript_seq <chr>, transcript_length <dbl>, num_GC <int>, num_C <int>,
## # num_CC <int>, GC_content <dbl>, C_content <dbl>, CC_content <dbl>
Plots
Mean read count
m3CRNA_transcriptinfo |>
rstatix::wilcox_test(mean_read ~ m3C)
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 mean_read m3C other 71 33102 2348049 5.31e-48
ecdf_meanread_m3C <-
m3CRNA_transcriptinfo |>
ggplot(aes(x = mean_read, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
geom_vline(xintercept = c(200, 300)) +
scale_x_log10() +
scale_color_manual(values = c('#00998C', '#808080'))
ecdf_meanread_m3C |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 4, fontsize = 7
)
## Warning: Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

m3CRNA_transcriptinfo |>
filter(m3C == 'm3C') |>
arrange(mean_read) |>
select(transcript_name, mean_read)
## # A tibble: 71 × 2
## transcript_name mean_read
## <chr> <dbl>
## 1 RPLP0-226 283.
## 2 CEACAM6-201 304.
## 3 ATP5F1A-202 328.
## 4 PRELID1-201 363.
## 5 H3-3B-201 365
## 6 MDK-203 471.
## 7 RPLP0-219 531.
## 8 ATP5MJ-201 571.
## 9 TOMM7-201 703
## 10 SH3BGRL3-201 783.
## # ℹ 61 more rows
m3CRNA_transcriptinfo |>
filter(mean_read > 283) |>
group_by(m3C) |>
reframe(n = n())
## # A tibble: 2 × 2
## m3C n
## <chr> <int>
## 1 m3C 71
## 2 other 296
m3CRNA_transcriptinfo |>
filter(mean_read > 283) |>
filter(m3C != 'm3C') |>
arrange(-mean_read) |>
select(transcript_name, mean_read)
## # A tibble: 296 × 2
## transcript_name mean_read
## <chr> <dbl>
## 1 RPS18-236 3026.
## 2 RPL31-201 2547
## 3 TMSB4X-204 2150.
## 4 RPS15A-203 2141.
## 5 RPS27A-201 1727.
## 6 MT-ND5-201 1688.
## 7 RPL23A-204 1676.
## 8 S100A11-201 1619.
## 9 RPS10-209 1529.
## 10 ENST00000423610 1521.
## # ℹ 286 more rows
m3CRNA_transcriptinfo |>
filter(mean_read > 283) |>
#filter(m3C != 'm3C') |>
group_by(m3C, mean_read > 1000) |>
reframe(n = n())
## # A tibble: 4 × 3
## m3C `mean_read > 1000` n
## <chr> <lgl> <int>
## 1 m3C FALSE 14
## 2 m3C TRUE 57
## 3 other FALSE 252
## 4 other TRUE 44
m3CRNA_transcriptinfo |>
filter(mean_read > 100) |>
ggplot(aes(x = mean_read, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
geom_vline(xintercept = c(200, 300)) +
scale_x_log10() +
scale_color_manual(values = c('#00998C', '#808080'))

Minimum read count
m3CRNA_transcriptinfo |>
rstatix::wilcox_test(min_read ~ m3C)
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 min_read m3C other 71 33102 2348102. 2.8e-59
ecdf_minread_m3C <-
m3CRNA_transcriptinfo |>
ggplot(aes(x = min_read, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
geom_vline(xintercept = c(100)) +
scale_x_log10() +
scale_color_manual(values = c('#00998C', '#808080'))
ecdf_minread_m3C |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 4, fontsize = 7
)
## Warning: Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

m3CRNA_transcriptinfo |>
filter(min_read > 100) |>
group_by(m3C) |>
reframe(n = n())
## # A tibble: 2 × 2
## m3C n
## <chr> <int>
## 1 m3C 71
## 2 other 330
# Number of rows in methylated_positions per transcript.
num_m3Csites <-
  count(methylated_positions, transcript_id, name = 'num_m3Csites')
num_m3Csites
## # A tibble: 71 × 2
## transcript_id num_m3Csites
## <chr> <int>
## 1 ENST00000009589.8 1
## 2 ENST00000199764.7 1
## 3 ENST00000202773.14 2
## 4 ENST00000215754.8 4
## 5 ENST00000229239.10 2
## 6 ENST00000230050.4 4
## 7 ENST00000233143.6 15
## 8 ENST00000234875.9 2
## 9 ENST00000243997.8 3
## 10 ENST00000254810.8 1
## # ℹ 61 more rows
correlation_minread_m3Cfraction <-
m3CRNA_transcriptinfo |>
filter(min_read > 10) |>
left_join(num_m3Csites) |>
replace_na(list(num_m3Csites = 0)) |>
ggplot(aes(x = min_read , y = num_m3Csites / num_C)) +
geom_hex(bins = 50) +
scale_x_log10() +
scale_fill_viridis_c(trans = 'log10') +
geom_vline(xintercept = c(100), color = 'gray20')
## Joining with `by = join_by(transcript_id)`
correlation_minread_m3Cfraction |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 6, fontsize = 7
)

nanocompore_wellexpressed <-
m3CRNA_transcriptinfo |>
filter(min_read > 100) |>
select(transcript_id) |>
distinct() |>
left_join(
read_tsv(
'Tables/DRS_m3C_sites/sampcomp_results_joined_2024-04-24.tsv.gz' |> paste_wd()
),
by = join_by(transcript_id)
) |>
filter(!is.na(intensity_up))
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nanocompore_wellexpressed
## # A tibble: 397,223 × 67
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 30 TCCTC NA
## 2 ENST00000429711.7 RPL32-204 31 CCTCG 1
## 3 ENST00000429711.7 RPL32-204 32 CTCGG 1
## 4 ENST00000429711.7 RPL32-204 33 TCGGC 1
## 5 ENST00000429711.7 RPL32-204 34 CGGCG 1
## 6 ENST00000429711.7 RPL32-204 35 GGCGC 1
## 7 ENST00000429711.7 RPL32-204 36 GCGCT 1
## 8 ENST00000429711.7 RPL32-204 37 CGCTG 1
## 9 ENST00000429711.7 RPL32-204 38 GCTGC 1
## 10 ENST00000429711.7 RPL32-204 39 CTGCC 1
## # ℹ 397,213 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
nanocompore_wellexpressed |>
group_by(intensity_up, middle_isC) |>
reframe(n = n()) |>
mutate(percent = 100 * n /sum(n))
## # A tibble: 8 × 4
## intensity_up middle_isC n percent
## <chr> <chr> <int> <dbl>
## 1 common C 489 0.123
## 2 common others 113 0.0284
## 3 only G C 1422 0.358
## 4 only G others 802 0.202
## 5 only I C 265 0.0667
## 6 only I others 168 0.0423
## 7 others C 93755 23.6
## 8 others others 300209 75.6
% of RNAs with intensity up sites or m3C sites
Calculate
# Per transcript: number of positions with intensity_up == 'common', number of
# those whose middle base is C, and yes/no flags for having at least one.
num_m3C_intensityup_wellexpressed <-
  summarise(
    group_by(nanocompore_wellexpressed, transcript_id),
    num_intensityup = sum(intensity_up == 'common', na.rm = TRUE),
    num_m3C = sum(intensity_up == 'common' & middle_isC == 'C', na.rm = TRUE),
    have_intensityup = ifelse(num_intensityup > 0, 'yes', 'no'),
    have_m3C = ifelse(num_m3C > 0, 'yes', 'no'),
    .groups = 'drop'
  )
num_m3C_intensityup_wellexpressed
## # A tibble: 400 × 5
## transcript_id num_intensityup num_m3C have_intensityup have_m3C
## <chr> <int> <int> <chr> <chr>
## 1 ENST00000007516.8 0 0 no no
## 2 ENST00000009180.10 0 0 no no
## 3 ENST00000009589.8 1 1 yes yes
## 4 ENST00000027335.8 0 0 no no
## 5 ENST00000175091.5 0 0 no no
## 6 ENST00000184266.3 0 0 no no
## 7 ENST00000196551.8 0 0 no no
## 8 ENST00000199764.7 1 1 yes yes
## 9 ENST00000202773.14 3 2 yes yes
## 10 ENST00000215754.8 4 4 yes yes
## # ℹ 390 more rows
# Number and percentage of transcripts with / without a 'common' intensity-up
# position.
percent_have_intensityup_wellexpressed <-
  num_m3C_intensityup_wellexpressed |>
  count(have_intensityup) |>
  mutate(percentage = 100 * n / sum(n))
percent_have_intensityup_wellexpressed
## # A tibble: 2 × 3
## have_intensityup n percentage
## <chr> <int> <dbl>
## 1 no 318 79.5
## 2 yes 82 20.5
# Number and percentage of transcripts with / without at least one position
# counted in num_m3C.
percent_have_m3C_wellexpressed <-
  num_m3C_intensityup_wellexpressed |>
  count(have_m3C) |>
  mutate(percentage = 100 * n / sum(n))
percent_have_m3C_wellexpressed
## # A tibble: 2 × 3
## have_m3C n percentage
## <chr> <int> <dbl>
## 1 no 329 82.2
## 2 yes 71 17.8
Donut plot
donutplot_have_intensityup <-
percent_have_intensityup_wellexpressed |>
donutplot(have_intensityup)
donutplot_have_intensityup |>
ggsave_multiple_formats(
width = 5, height = 5, fontsize = 7, outdir = figdir
)

donutplot_have_m3C <-
percent_have_m3C_wellexpressed |>
donutplot(have_m3C)
donutplot_have_m3C |>
ggsave_multiple_formats(
width = 5, height = 5, fontsize = 7, outdir = figdir
)

% of sites with intensity up
nanocompore_wellexpressed |>
filter(!is.na(intensity_up))
## # A tibble: 397,223 × 67
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 30 TCCTC NA
## 2 ENST00000429711.7 RPL32-204 31 CCTCG 1
## 3 ENST00000429711.7 RPL32-204 32 CTCGG 1
## 4 ENST00000429711.7 RPL32-204 33 TCGGC 1
## 5 ENST00000429711.7 RPL32-204 34 CGGCG 1
## 6 ENST00000429711.7 RPL32-204 35 GGCGC 1
## 7 ENST00000429711.7 RPL32-204 36 GCGCT 1
## 8 ENST00000429711.7 RPL32-204 37 CGCTG 1
## 9 ENST00000429711.7 RPL32-204 38 GCTGC 1
## 10 ENST00000429711.7 RPL32-204 39 CTGCC 1
## # ℹ 397,213 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Number and percentage of positions whose intensity_up is 'common' versus
# anything else.
percent_intensityup_sites_wellexpressed <-
  nanocompore_wellexpressed |>
  count(common_intensity_up = intensity_up == 'common') |>
  mutate(percentage = 100 * n / sum(n))
percent_intensityup_sites_wellexpressed
## # A tibble: 2 × 3
## common_intensity_up n percentage
## <lgl> <int> <dbl>
## 1 FALSE 396621 99.8
## 2 TRUE 602 0.152
donutplot_intensityup_sites <-
percent_intensityup_sites_wellexpressed |>
donutplot(common_intensity_up)
donutplot_intensityup_sites |>
ggsave_multiple_formats(
width = 5, height = 5, fontsize = 7, outdir = figdir
)

Length
m3CRNA_transcriptinfo |>
rstatix::wilcox_test(transcript_length ~ m3C)
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 transcript_length m3C other 71 36646 975083 0.00026
ecdf_RNAlength_m3C <-
m3CRNA_transcriptinfo |>
ggplot(aes(x = transcript_length, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
scale_x_log10() +
scale_color_manual(values = c('#00998C', '#808080'))
ecdf_RNAlength_m3C |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 4, fontsize = 7
)

# m3CRNA_transcriptinfo |>
# filter(genetype2 == 'mRNA') |>
# rstatix::wilcox_test(mean_normcount ~ m3C)
ecdf_mRNAlength_m3C <-
m3CRNA_transcriptinfo |>
filter(genetype2 == 'mRNA') |>
ggplot(aes(x = transcript_length, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
scale_x_log10() +
scale_color_manual(values = c('#00998C', '#808080'))
ecdf_mRNAlength_m3C |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 4, fontsize = 7
)

C content
m3CRNA_transcriptinfo |>
rstatix::wilcox_test(C_content ~ m3C)
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 C_content m3C other 71 36646 1524682. 0.0122
ecdf_Ccontent_m3C <-
m3CRNA_transcriptinfo |>
ggplot(aes(x = C_content, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
scale_color_manual(values = c('#00998C', '#808080'))
ecdf_Ccontent_m3C |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 4, fontsize = 7
)

CC content
m3CRNA_transcriptinfo |>
rstatix::wilcox_test(CC_content ~ m3C)
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 CC_content m3C other 71 36646 1521939 0.0133
ecdf_CCcontent_m3C <-
m3CRNA_transcriptinfo |>
ggplot(aes(x = CC_content, colour = m3C)) +
stat_ecdf(lwd = 1.1) +
scale_color_manual(values = c('#00998C', '#808080'))
ecdf_CCcontent_m3C |>
ggsave_multiple_formats(
outdir = figdir, width = 6, height = 4, fontsize = 7
)
